# install packages
# install.packages("plyr")
# install.packages("dplyr")
# install.packages("ggplot2")
# install.packages("ggmap")
# install.packages("RColorBrewer")

# declare libraries
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ggmap)
library(RColorBrewer)

# read in citibike data
citiBike <- read.csv("201511-citibike-tripdata.csv")

# function that creates a vector of gradient colors based on a size input and vector of colors
gradient <- function(n, colors){
  colfunc <- colorRampPalette(colors)
  return(colfunc(n))
}

Patterns in Ride History Data

Asymmetric Traffic

Key question: Which stations see the most asymmetric traffic (more arrivals than departures and vice versa)?

# Get counts of number of times each station was used as an end station and start station.
endStation<-as.data.frame(table(citiBike$end.station.id))
startStation<-as.data.frame(table(citiBike$start.station.id))

# Merge the two data frames into one
stationData <- merge(startStation, endStation, by="Var1", all=TRUE)

# Set stations that dont have a count (an NA value) to 0
stationData[is.na(stationData)] <- 0

# Rename the columns 
names(stationData) <- c("station.id", "start.freq", "end.freq")

# Calculate incoming and outgoing traffic
stationData$inTraffic <- stationData$start.freq - stationData$end.freq
stationData$outTraffic <- stationData$end.freq - stationData$start.freq

The stations with the most outgoing traffic are:

# Order the stations by the most outgoing traffic and graph
outTraffic <- stationData[order(-stationData$outTraffic),]
barplot(outTraffic$outTraffic[outTraffic$outTraffic > 300],
        names.arg = outTraffic$station.id[outTraffic$outTraffic > 300], ylim = c(0, 1000),
        col = gradient(12, c("darkgreen", "palegreen3", "palegreen2")), 
        density = 90, border = TRUE, las = 2, main="Bike Stations with Highest Surplus",  
        xlab = "Station IDs", ylab = "Surplus of Bikes (Arrivals - Departures)",
        cex.axis = 0.75,  cex.names = 0.75,  cex.lab = 0.9)

The stations with the most incoming traffic are:

# Order the stations by the most incoming traffic and graph
inTraffic <- stationData[order(-stationData$inTraffic),]
barplot(inTraffic$inTraffic[inTraffic$inTraffic > 300],
        names.arg = inTraffic$station.id[inTraffic$inTraffic > 300],
        ylim = c(0, 2500), col = gradient(11, c("darkred", "red", "darkorange", "orange")), 
        density = 90, border = TRUE, las = 2, main="Bike Stations with Highest Deficit",
        xlab = "Station IDs", ylab = "Deficit of Bikes (Departures - Arrivals)",
        cex.axis = 0.75,  cex.names = 0.75,  cex.lab = 0.9)

We used barplots to show which CitiBike stations had the highest net inflow and highest net outflow of bikes over the month of November 2015. We can see that six stations had a net outflow of over 600 bikes and five stations had a net inflow of over 500 bikes over the course of the month. Clearly certain stations are not getting similar numbers of incoming and outgoing bikes, and with this information, CitiBike can track the number of bikes at stations with high numbers of outgoing bikes more closely to ensure that they are always stocked with bikes taken from those left at stations with high numbers of incoming bikes.

Longest Rides and Times of Day

Which stations originate the longest rides? Does this change as we go through different times of the day?

# Calculate the mean ride duration for all stations, listed them by start.station.id
longestRide <- aggregate(citiBike[, 1], list(citiBike$start.station.id), mean)
names(longestRide) <- c("station.id", "mean.trip.duration")

# Merge into dataframe from part 1
stationData <- merge(stationData, longestRide, by="station.id", all=TRUE)
stationData[is.na(stationData)] <- 0

# Order data by trip duration and graph stations with rides of average duration of over 2000
longestRide <- stationData[order(-stationData$mean.trip.duration),]
barplot(longestRide$mean.trip.duration[longestRide$mean.trip.duration > 2000], main = "Stations that Originate the Longest Rides", names.arg = longestRide$station.id[longestRide$mean.trip.duration > 2000], xlab = "Station IDs", ylab = "Ride Duration")

# Determine what time of day each ride started in: early morning, morning, afternoon, evening, or late evening
citiBike$time.slot = strtoi(format(as.POSIXct(citiBike$starttime, format = "%m/%d/%Y %H:%M:%S"), format = "%H"), base = 10L)
citiBike$time.slot.category[citiBike$time.slot < 8] = "earlyMorning"
citiBike$time.slot.category[citiBike$time.slot < 12 & citiBike$time.slot >= 8] <- "morning"
citiBike$time.slot.category[citiBike$time.slot < 16 & citiBike$time.slot >= 12] <- "afternoon"
citiBike$time.slot.category[citiBike$time.slot < 20 & citiBike$time.slot >= 16] <- "evening"
citiBike$time.slot.category[citiBike$time.slot >= 20] <- "lateEvening"

# Calculate mean duration for each station and time of day combination
longest.ride.category <- aggregate(citiBike[, 1], list(citiBike$start.station.id, citiBike$time.slot.category), mean)
names(longest.ride.category) <- c("station.id", "time.slot", "mean")
longest.ride.category <- longest.ride.category[order(-longest.ride.category$mean),]

# Create graphs for each of the time of days
early.morning.ride = longest.ride.category[longest.ride.category$time.slot == "earlyMorning",]
barplot(early.morning.ride$mean[early.morning.ride$mean > 2500], names.arg = early.morning.ride$station.id[early.morning.ride$mean > 2500], main="Stations that Originate the Longest Rides in the Early Morning", xlab = "Station IDs", ylab = "Mean Ride Duration")

morning.ride = longest.ride.category[longest.ride.category$time.slot == "morning",]
barplot(morning.ride$mean[morning.ride$mean > 2500], names.arg = morning.ride$station.id[morning.ride$mean > 2500], main="Stations that Originate the Longest Rides in the Morning", xlab = "Station IDs", ylab = "Mean Ride Duration")

afternoon.ride = longest.ride.category[longest.ride.category$time.slot == "afternoon",]
barplot(afternoon.ride$mean[afternoon.ride$mean > 3000], names.arg = afternoon.ride$station.id[afternoon.ride$mean > 3000], main="Stations that Originate the Longest Rides in the Afternoon", xlab = "Station IDs", ylab = "Mean Ride Duration")

evening.ride = longest.ride.category[longest.ride.category$time.slot == "evening",]
barplot(evening.ride$mean[evening.ride$mean > 2500], names.arg = evening.ride$station.id[evening.ride$mean > 2500], main="Stations that Originate the Longest Rides in the Evening", xlab = "Station IDs", ylab = "Mean Ride Duration")

late.evening.ride = longest.ride.category[longest.ride.category$time.slot == "lateEvening",]
barplot(late.evening.ride$mean[late.evening.ride$mean > 3000], names.arg = late.evening.ride$station.id[late.evening.ride$mean > 3000], main = "Stations that Originate the Longest Rides in the Late Evening", xlab = "Station IDs", ylab = "Mean Ride Duration")

With this data, CitiBike can see which stations originate the longest rides and when. They can then make sure to rotate less frequently used bikes to these stations and move frequently used bikes to stations that originate shorter rides. By doing so, CitiBike can keep the wear and tear on individual bikes relatively even and keep maintenance costs down.

Dataset Visualization

# change

# ===============================================================================
# ===============================[calculations]==================================
# ===============================================================================

# =============================[most used routes]================================

# function returns a unique integer from two integer inputs
cantor_pairing_function <- function(a, b){
  return(0.5 * (a + b) * (a + b + 1) + b)
}

# get all routes taken
routes <- subset(citiBike, select = c(start.station.id, end.station.id, start.station.latitude, start.station.longitude, end.station.latitude, end.station.longitude, usertype))
colnames(routes) <- c("start.id", "end.id", "start.lat", "start.lon", "end.lat", "end.lon", "user")

# apply unique route.id based on the cantor pairing function, with start.id and end.id as inputs
routes$route.id = cantor_pairing_function(routes$start.id, routes$end.id)

# seperate by users
subscriber_routes <- subset(routes, user == "Subscriber")
customer_routes <- subset(routes, user == "Customer")
routes <- subset(routes, select = -c(user))

# count # of times a route has been used
route_counts <- routes %>%
  count(route.id)
colnames(route_counts) <- c("route.id", "route.count")

# get only unique routes
unique_routes <- unique(routes)

# merge route count with ids, lats, and longitudes
route_counts <- join(unique_routes, route_counts, by = "route.id", type = "full", match = "first")
route_counts <- subset(route_counts, select = -c(route.id))

# sort by highest count 
route_counts <- route_counts[with(route_counts, order(-route.count)), ]

# =============================[route usage by user]============================

# count # of times a route has been used for subscribers
subscriber_routes <- subscriber_routes %>%
  count(route.id)
colnames(subscriber_routes) <- c("route.id", "route.count")
subscriber_routes <- join(unique_routes, subscriber_routes, by = "route.id", type = "full", match = "first")
subscriber_routes <- subset(subscriber_routes, select = -c(route.id))
subscriber_routes <- subscriber_routes[with(subscriber_routes, order(-route.count)), ]

# count # of times a route has been used for customers
customer_routes <- customer_routes %>%
  count(route.id)
colnames(customer_routes) <- c("route.id", "route.count")
customer_routes <- join(unique_routes, customer_routes, by = "route.id", type = "full", match = "first")
customer_routes <- subset(customer_routes, select = -c(route.id))
customer_routes <- customer_routes[with(customer_routes, order(-route.count)), ]

# =============================[station surplus]================================

# get arrivals to station
arrival_counts <- citiBike %>%
  count(end.station.id)
colnames(arrival_counts) <- c("id", "arr")

# get departures from station
departure_counts <- citiBike %>%
  count(start.station.id)
colnames(departure_counts) <- c("id", "dep")

# get unique stations
stations <- unique(subset(citiBike, select = c(start.station.id, start.station.latitude, start.station.longitude)))
colnames(stations) <- c("id", "lat", "lon")

# merge arrivals and departures with list of stations
stations <- join(stations, arrival_counts, by = "id", type = "full", match = "first")
stations <- join(stations, departure_counts, by = "id", type = "full", match = "first")

# calculate surplus of station
stations$surplus = stations$arr - stations$dep

# seperate surplus and deficit stations
stations.surplus <- subset(stations, surplus >= 0)
stations.deficit <- subset(stations, surplus <= 0)


# ===============================================================================
# ==================================[mapping]====================================
# ===============================================================================

min_lat <- min(c(min(citiBike$start.station.latitude), min(citiBike$end.station.latitude))) + 0.025
max_lat <- max(c(max(citiBike$start.station.latitude), max(citiBike$end.station.latitude))) + 0.0055
min_lon <- min(c(min(citiBike$start.station.longitude), min(citiBike$end.station.longitude))) + 0.029
max_lon <- max(c(max(citiBike$start.station.longitude), max(citiBike$end.station.longitude))) + 0.005

nyc <- c(left = min_lon, bottom = min_lat, right = max_lon, top = max_lat)
basemap <- get_map(nyc, zoom = 13, maptype = "terrain", source = "stamen")
## Map from URL : http://tile.stamen.com/terrain/13/2411/3077.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3077.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3077.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3078.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3078.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3078.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3079.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3079.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3079.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3080.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3080.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3080.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3081.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3081.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3081.jpg
# =============================[most used routes]================================

# declare variables
elements <- 20
origin <- c()
dest <- c()

# get routes
for(i in 1:elements){
  origin <- append(origin, paste(route_counts$start.lat[i], route_counts$start.lon[i], sep=","))
  dest <- append(dest, paste(route_counts$end.lat[i], route_counts$end.lon[i], sep=","))
}

# initialize routes map
map.routes <- ggmap(basemap, base_layer = ggplot(stations, aes(x = lon, y = lat)))

# build routes map
for(i in 1:(elements)){
  k <- (i - 1) / elements
  map.routes <- map.routes +
    geom_path(aes(x = lon, y = lat), data = route(origin[i], dest[i], structure = "route"),
              color = "darkred", size = 2 - (k * (2 - 0.5)), alpha = 1 - (k * (1 - 0.5)))
  # to deal with Google API request limit
  Sys.sleep(0.5)
}
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76590936,-73.97634151&destination=40.76590936,-73.97634151&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74173969,-73.99415556&destination=40.7454973,-74.00197139&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76590936,-73.97634151&destination=40.7768286343997,-73.9638876914978&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71273266,-74.0046073&destination=40.71273266,-74.0046073&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74173969,-73.99415556&destination=40.74691959,-74.00451887&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71921585,-73.9424469&destination=40.715143,-73.944507&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74232744,-73.95411749&destination=40.74731,-73.95451&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.751873,-73.977706&destination=40.74096374,-73.98602213&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7643971,-73.97371465&destination=40.7643971,-73.97371465&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7612274,-73.96094022&destination=40.76500525,-73.95818491&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.77536905,-73.94803392&destination=40.77801203,-73.95407149&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.715143,-73.944507&destination=40.71921585,-73.9424469&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76590936,-73.97634151&destination=40.78472675,-73.96961715&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.73028666,-73.9907647&destination=40.72740794,-73.98142006&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.751873,-73.977706&destination=40.75640548,-73.9900262&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.711512,-74.015756&destination=40.701907,-74.013942&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.751873,-73.977706&destination=40.75019995,-73.99093085&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76500525,-73.95818491&destination=40.76440023,-73.96648977&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.72456089,-73.99565293&destination=40.73028666,-73.9907647&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74734825,-73.99723551&destination=40.75066386,-74.00176802&mode=driving&units=metric&alternatives=false&sensor=false
# display most used routes map
map.routes

# =============================[most used routes by subscribers]================================

# declare variables
elements <- 20
origin <- c()
dest <- c()

# get routes
for(i in 1:elements){
  origin <- append(origin, paste(subscriber_routes$start.lat[i], subscriber_routes$start.lon[i], sep=","))
  dest <- append(dest, paste(subscriber_routes$end.lat[i], subscriber_routes$end.lon[i], sep=","))
}

# initialize routes map
map.routes.subscribers <- ggmap(basemap, base_layer = ggplot(stations, aes(x = lon, y = lat)))

# build routes map
for(i in 1:(elements)){
  k <- (i - 1) / elements
  map.routes.subscribers <- map.routes.subscribers +
    geom_path(aes(x = lon, y = lat), data = route(origin[i], dest[i], structure = "route"),
              color = "darkred", size = 2 - (k * (2 - 0.5)), alpha = 1 - (k * (1 - 0.5)))
  # to deal with Google API request limit
  Sys.sleep(0.5)
}
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74173969,-73.99415556&destination=40.7454973,-74.00197139&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71921585,-73.9424469&destination=40.715143,-73.944507&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74173969,-73.99415556&destination=40.74691959,-74.00451887&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74232744,-73.95411749&destination=40.74731,-73.95451&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.751873,-73.977706&destination=40.74096374,-73.98602213&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7612274,-73.96094022&destination=40.76500525,-73.95818491&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.77536905,-73.94803392&destination=40.77801203,-73.95407149&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.715143,-73.944507&destination=40.71921585,-73.9424469&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.751873,-73.977706&destination=40.75640548,-73.9900262&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.73028666,-73.9907647&destination=40.72740794,-73.98142006&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.751873,-73.977706&destination=40.75019995,-73.99093085&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74734825,-73.99723551&destination=40.75066386,-74.00176802&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76500525,-73.95818491&destination=40.76440023,-73.96648977&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74731,-73.95451&destination=40.74232744,-73.95411749&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71745169,-73.95850939&destination=40.716887,-73.963198&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.70834698,-74.01713445&destination=40.7153379,-74.01658354&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.72456089,-73.99565293&destination=40.73028666,-73.9907647&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.72740794,-73.98142006&destination=40.73028666,-73.9907647&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7153379,-74.01658354&destination=40.72043411,-74.01020609&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76312584,-73.96526895&destination=40.76500525,-73.95818491&mode=driving&units=metric&alternatives=false&sensor=false
# display most used routes map
map.routes.subscribers

# =============================[most used routes by customers]================================

# declare variables
elements <- 20
origin <- c()
dest <- c()

# get routes
for(i in 1:elements){
  origin <- append(origin, paste(customer_routes$start.lat[i], customer_routes$start.lon[i], sep=","))
  dest <- append(dest, paste(customer_routes$end.lat[i], customer_routes$end.lon[i], sep=","))
}

# initialize routes map
map.routes.customers <- ggmap(basemap, base_layer = ggplot(stations, aes(x = lon, y = lat)))

# build routes map
for(i in 1:(elements)){
  k <- (i - 1) / elements
  map.routes.customers <- map.routes.customers +
    geom_path(aes(x = lon, y = lat), data = route(origin[i], dest[i], structure = "route"),
              color = "darkred", size = 2 - (k * (2 - 0.5)), alpha = 1 - (k * (1 - 0.5)))
  # to deal with Google API request limit
  Sys.sleep(0.5)
}
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76590936,-73.97634151&destination=40.76590936,-73.97634151&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76590936,-73.97634151&destination=40.7768286343997,-73.9638876914978&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71273266,-74.0046073&destination=40.71273266,-74.0046073&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7643971,-73.97371465&destination=40.7643971,-73.97371465&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76590936,-73.97634151&destination=40.78472675,-73.96961715&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76590936,-73.97634151&destination=40.77282817,-73.96685276&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7768286343997,-73.9638876914978&destination=40.7768286343997,-73.9638876914978&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.70277159,-73.99383605&destination=40.71273266,-74.0046073&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7768286343997,-73.9638876914978&destination=40.78472675,-73.96961715&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7768286343997,-73.9638876914978&destination=40.76590936,-73.97634151&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7643971,-73.97371465&destination=40.7768286343997,-73.9638876914978&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76915505,-73.98191841&destination=40.76915505,-73.98191841&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71273266,-74.0046073&destination=40.69597683,-73.99014892&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76087502,-74.00277668&destination=40.71754834,-74.01322069&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.77282817,-73.96685276&destination=40.77282817,-73.96685276&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.77282817,-73.96685276&destination=40.78472675,-73.96961715&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.78472675,-73.96961715&destination=40.78472675,-73.96961715&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.77282817,-73.96685276&destination=40.7768286343997,-73.9638876914978&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7768286343997,-73.9638876914978&destination=40.7643971,-73.97371465&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.69991755,-73.98971773&destination=40.71273266,-74.0046073&mode=driving&units=metric&alternatives=false&sensor=false
# display most used routes map
map.routes.customers

# =============================[station surplus]================================

# build surplus stations map
map.surplus <- ggmap(basemap, base_layer = ggplot(stations.surplus, aes(x = lon, y = lat))) +
  geom_point(aes(color = surplus, size = surplus), alpha = .8) +
  scale_color_gradient(low = "green", high = "darkgreen") +
  scale_size(range = c(0.5, 10))

# build deficit stations map
map.deficit <- ggmap(basemap, base_layer = ggplot(stations.deficit, aes(x = lon, y = lat))) +
  geom_point(aes(color = surplus, size = surplus), alpha = .8) +
  scale_color_gradient(low = "darkred", high = "red") +
  scale_size(range = c(30, 1))

# display surplus and deficit maps
map.surplus

map.deficit

Business Issues

Stations running out of bikes is a big problem. The client would want to know which stations are candidates for improving bike storage capacity.

# The stations that are start stations the most number of times are likely the best candidates for a higher bike storage capacity
orderedByMostIncomingTraffic <- stationData[order(stationData$outTraffic),]
head(orderedByMostIncomingTraffic)
##     station.id start.freq end.freq inTraffic outTraffic mean.trip.duration
## 460       3230       5634     3164      2470      -2470           912.0309
## 466       3236       2340     1134      1206      -1206          1413.0620
## 286        519      11267    10232      1035      -1035           916.9943
## 284        517       5008     4168       840       -840           828.6024
## 278        511       4603     3978       625       -625           805.1271
## 168        391       1640     1182       458       -458          1724.0012

A more nuanced approach would be to find the number of total bikes, theoretically split them up equally among all stations, and see which stations need a larger capacity based on each arrival/departure of a bike.

# Find number of unique bike ids
bikeCapacity <- floor(nrow(as.data.frame(table(citiBike$bikeid))) / nrow(stationData))
orderedByMostIncomingTraffic <- stationData[order(stationData$outTraffic),]

# Show station that had most bikes leave than arrive; show station that had most bikes arrive than leave. Due to memory restraints, we cannot show all stations. If we could, we would iterate through every station in orderedByMostIncomingTraffic and show the plot for each station.
stId <- 1
for(i in 1:2)
{
  stationId <- orderedByMostIncomingTraffic[stId,1]
  stationInvolved <- citiBike[(citiBike$start.station.id == stationId | citiBike$end.station.id == stationId),]
  numDoubleStations <- nrow(citiBike[(citiBike$start.station.id == stationId & citiBike$end.station.id == stationId),])
  
  # vectors to hold capacity at a certain time
  timeV <- character(nrow(stationInvolved) + numDoubleStations)
  capacityV <- integer(nrow(stationInvolved) + numDoubleStations)
  # initial values
  timeV[1] <- "11/1/2015 00:00:00"
  capacityV[1] <- bikeCapacity
  timeVIndex <- 2
  capacityVIndex <- 2
  currCapacity = bikeCapacity
  # iterate through everytime the station is either a start or end station
  for(j in 1:nrow(stationInvolved))
  {
    currRow <- stationInvolved[j,]
    if(currRow$start.station.id == stationId)
    {
      currCapacity <- currCapacity - 1
      timeV[timeVIndex] <- toString(currRow$starttime)
      capacityV[capacityVIndex] <- currCapacity
      timeVIndex <- timeVIndex + 1
      capacityVIndex <- capacityVIndex + 1
    }
    if(currRow$end.station.id == stationId)
    {
      currCapacity <- currCapacity + 1
      timeV[timeVIndex] <- toString(currRow$stoptime)
      capacityV[capacityVIndex] <- currCapacity
      timeVIndex <- timeVIndex + 1
      capacityVIndex <- capacityVIndex + 1
    }
  }

  timeVsCapacity <- data.frame(timeV, capacityV)
  colnames(timeVsCapacity) = c("Time", "Capacity")
  timeVsCapacity$Time <- factor(timeVsCapacity$Time, levels=unique(as.character(timeVsCapacity$Time)))
  
  plot(timeVsCapacity$Time, timeVsCapacity$Capacity, main=stationId, xlab="Time", ylab="Number of bikes at station", type="n")
  lines(timeVsCapacity$Time, timeVsCapacity$Capacity, type="o")
  
  stId <- 475
}

There are clearly stations that are start stations much more frequently than end stations, and vice versa. By identifying these outlier stations, CitiBike can increase/decrease bike capacity at certain stations to better fit demand. In addition, the analysis conducted did not assume CitiBike redistributes bikes at the end of the day. In reality, CitiBike does this, but the analysis can help CitiBike identify which stations to take bikes from and which stations to add bikes to.

Bike maintenance bills are piling up. Client thinks that this is because some bikes are being used a lot more than other bikes. Can you check on this assumption?

First failed approach: tried to get information on frequency that each Bike ID is used as a way to gague demand. I realized though that this wouldn’t lead to any substantive recommendations because there would still be lots of unanswered questions: for example, one bike may be rented a lot but the trip duration for each ride is extremely short. So the code is below but my second attempt centers around trip duration which felt more acurrate.

if (FALSE) {
  # Condensed data frame of just bikeID and frequency it's used 
  demand <- citiBike %>%
    group_by(bikeid) %>%
    select(bikeid) %>%
    summarise(
      count = count(citiBike, "bikeid")
  )
  #rideTime = sum(citiBike$tripduration, na.rm = TRUE) THIS LINE IS GIVING ME ISSUES 
  # trying to sort the data 
  # don't think this is working though 
  sortedDemand <- demand[order(-demand$count),]
  head(sortedDemand)

  # plotting to see frequnecy patters 
  plot(sortedDemand) # looks the same as below 
  plot(demand, main = "Rental Frequencies by Bike ID", xlab = "Bike ID", ylab = "Frequency of Renting")
  # this shows that the bikes within the 22000 - 24000 bike IDs have higher frequency of usage so yes it looks like there perhaps are    some bikes that are used more significantly than others 

  max(citiBike$bikeid)
  min(citiBike$bikeid)
  # range of bike IDs is from 14529 - 24765 
}

Second attempt approach: I felt that this led to more tangible recommendations and less unanswered questions. My approach was to sum up trip durations by bike ID, leading to a list of bikes that are clearly more used than others. My graph features a blue line indicating the average trip duration and the red line indicates median trip duration. These two lines are relatively close to each other.

# summing trip durations and grouping by BikeID
tripSum <- tapply(citiBike$tripduration, citiBike$bikeid, sum)
head(tripSum)
##  14529  14530  14531  14532  14533  14534 
##  89537  31966  34212  39165 133242  49372
tripDurations <- as.data.frame(as.table(tripSum))
names(tripDurations) <- c("bikeID", "minutes")

#sorted by most minutes on bike 
tripDurations <- tripDurations[order(-tripDurations$minutes),]

# visualizing the information 
barplot(height = tripDurations[1:100,2],xlab = "Bike ID", ylab = "Total Time on Bike by minutes", main = "Top 200 Bike Riding Time Totals")

# Blue Line Indicates Averages 
abline(h = mean(tripDurations$minutes, na.rm = TRUE), col = "blue")

# Red Line Indicates Median 
abline(h = median(tripDurations$minutes, na.rm = TRUE), col = "red")

##### Clearly, of the tens of thousands of CitiBikes, there is an upper echelon of bikes that have significantly higher total ride times. They have been identified by bikeID in the data frame “tripDurations”. Our recommendation would be to target those top 100-200 bikes with more frequent maintenance in order to overall drive down unnecessary follow up maintenance costs.

In order to identify the least used stations by both younger folks (age < 25) and tourists (customers but not subscribers) the following analysis was done. This focuses on where these specific demographics start their trips in order to identify what the least used stations by these two target groups are during November 2015.

#Filter users born after 1991 into a separate data frame
young.customers <- subset(citiBike, citiBike$birth.year > 1991)
young.startStation <- as.data.frame(table(young.customers$start.station.id))
#Sort data frame of frequency of use of certain bike stations in increasing order:
sorted.young <- young.startStation[order(young.startStation$Freq),]
colnames(sorted.young) <- c("Start Station ID","# of times used")
print(head(sorted.young, n=10),row.names=FALSE)
##  Start Station ID # of times used
##              3052               1
##              3127               1
##              3221               1
##               418               2
##              3044               2
##              3089               2
##              3117               2
##               233               3
##               436               3
##              3043               3
#Filter users that are customers and not subscribers into a separate data frame
tourist.customers <- subset(citiBike,citiBike$usertype == "Customer")
tourist.startStation <- as.data.frame(table(tourist.customers$start.station.id))
#Sort data frame of frequency of use of certain bike stations in increasing order:
sorted.tourist <- tourist.startStation[order(tourist.startStation$Freq),]
colnames(sorted.tourist) <- c("Start Station ID","# of times used")
print(head(sorted.tourist, n=10),row.names=FALSE)
##  Start Station ID # of times used
##              2005               1
##              3054               2
##               218               5
##              3059               5
##              3049               8
##              3221               8
##              3070              10
##              3048              11
##              3053              11
##              3061              11
The analysis above identifies which stations are least used by users under 25 and users that do not have a subscription. This focused on the start station ID rather than the end station ID. This relies on the assumption that an individual’s willingness to use CitiBike is predicated more on if they can rent a bike initially, rather than drop it at a convenient place after their trip (worst case they can bring the bike to their starting point).

Bike Usage: Weekdays versus Weekends

Key question: Is there a difference in peak bike usage on weekends and weekdays?

# Separate the bike data into weekends versus weekdays
citiBike$day = format(as.POSIXct(citiBike$starttime, format = "%m/%d/%Y %H:%M:%S"), format = "%u")
citiBikeWeekday <- citiBike[citiBike$day < 6,]
citiBikeWeekend <- citiBike[citiBike$day >= 6,]
citiBikeWeekday$hour = strtoi(format(as.POSIXct(citiBikeWeekday$starttime, format = "%m/%d/%Y %H:%M:%S"), format = "%H"), base = 10L)
citiBikeWeekend$hour = strtoi(format(as.POSIXct(citiBikeWeekend$starttime, format = "%m/%d/%Y %H:%M:%S"), format = "%H"), base = 10L)

# Get the counts of number of rides for each hour
weekdayStart <- as.data.frame(table(citiBikeWeekday$hour))
weekendStart <- as.data.frame(table(citiBikeWeekend$hour))

# Plot 
barplot(weekdayStart$Freq, names.arg = weekdayStart$Var1, main = "Weekday CitiBike Usage by Time", xlab = "Time (Hour)", ylab = "Number of Users")

barplot(weekendStart$Freq, names.arg = weekendStart$Var1, main = "Weekend CitiBike Usage by Time", xlab = "Time (Hour)", ylab = "Number of Users")

From these graphs, we can see that on weekdays, peak bike usage occurs around 8 to 9 am and around 5 to 6 pm, which are the times during which most people are travelling to and from work. On weekends on the other hand, bike usage plateaus from noon to 4 pm. From these graphs, we can identify peak demand hours on weekdays and weekends and ensure that bikes are being properly distributed before and during these times so customers can easily locate bikes.

User Types and Gender

Additional analysis: Genders within User Types
Key question: Is there a big shift within the gender of both our customers and our subscribers?

genders <- citiBike %>%
  group_by(usertype) %>%
  select(gender) 

genderTable <- table(genders)

# 1 is Male
# 2 is Female

# We see that that there is no gender information for customers, so the barplots will illustrate gender 
# discrepencies within Subscribers. 


labels = c("unidentified", "male", "female")
barplot(genderTable, main = "Breakdown of Genders within Subscribers", xlab = "Gender", ylab = "Frequency", names.arg = labels)

Based on this barplot, we see that within the subscribers there is a much higher number of males than females. This can be very useful for the marketing team as they think about how to segment and target the key demographics, especially for a user type like subscriber which is a long-term relationship CitiBike strives to maintain. This can also lead to key questions like if there are perhaps lifestyle reasons for why women do not utilize the bike share service as much, for example, having to wear skirts to the office place. Perhaps CitiBike can do more to appeal to female subscribers by also offering additional services like partnerships with local gyms where women can store a change of work clothes.